# Using readr::read_csv instead of data.table::fread because fread has some
# trouble handling NA values in this file
library(readr)
posting <- read_csv("postings.csv")
## Rows: 123849 Columns: 31
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (17): company_name, title, description, pay_period, location, formatted_...
## dbl (14): job_id, max_salary, company_id, views, med_salary, min_salary, app...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#View(posting)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
Include original_listed_time, expiry and closed_time. All times are in UNIX epoch milliseconds.
# Keep only the columns needed for the analysis (same columns, same order)
posting_clean <- posting %>%
  select(
    job_id, company_name, title,
    max_salary, med_salary, min_salary, pay_period,
    location, company_id, formatted_work_type,
    original_listed_time, expiry, closed_time,
    applies, remote_allowed, application_type,
    formatted_experience_level, sponsored, work_type,
    currency, zip_code, fips
  )
First, we will check whether there are any completely duplicated rows, which would indicate duplicates introduced by the data-retrieval process. Note that we include job_id when checking for complete duplicates, because if the same job appears under two different ids, that job was posted twice. In that case, we will check how often the same job is posted more than once and decide what to do with it accordingly.
#View(posting_clean)
# Keep every copy of any fully duplicated row: scanning from both ends marks
# all occurrences, not just the second and later ones
duplicated_posting_clean <- posting_clean %>%
  filter(duplicated(.) | duplicated(., fromLast = TRUE))
duplicated_posting_clean
## # A tibble: 0 × 22
## # ℹ 22 variables: job_id <dbl>, company_name <chr>, title <chr>,
## # max_salary <dbl>, med_salary <dbl>, min_salary <dbl>, pay_period <chr>,
## # location <chr>, company_id <dbl>, formatted_work_type <chr>,
## # original_listed_time <dbl>, expiry <dbl>, closed_time <dbl>, applies <dbl>,
## # remote_allowed <dbl>, application_type <chr>,
## # formatted_experience_level <chr>, sponsored <dbl>, work_type <chr>,
## # currency <chr>, zip_code <chr>, fips <chr>
# Drop id and timestamp columns so duplicates are judged on content alone
posting_clean_no_id <- posting_clean %>%
  select(!c(job_id, company_id, original_listed_time, expiry, closed_time))
#View(posting_clean_no_id)
# Same both-directions trick as above so every copy of a duplicate is kept
duplicated_posting_clean_no_id <- posting_clean_no_id %>%
  filter(duplicated(.) | duplicated(., fromLast = TRUE))
duplicated_posting_clean_no_id
## # A tibble: 10,045 × 17
## company_name title max_salary med_salary min_salary pay_period location
## <chr> <chr> <dbl> <dbl> <dbl> <chr> <chr>
## 1 City of Tempe Deputy Ci… 274206 NA 205398 YEARLY Tempe, …
## 2 SKF Group Technician NA NA NA <NA> Plymout…
## 3 SKF Group Heat Trea… 27.9 NA 19.6 HOURLY Muskego…
## 4 SKF Group Quality E… NA NA NA <NA> Lansdal…
## 5 SKF Group Maintenan… NA NA NA <NA> Sumter,…
## 6 SKF Group Manufactu… NA NA NA <NA> Dexter,…
## 7 SKF Group Regional … NA NA NA <NA> Cincinn…
## 8 SKF Group Machining… NA NA NA <NA> Muskego…
## 9 SKF Group CNC Machi… NA NA NA <NA> Hanover…
## 10 SKF Group Maintenan… NA NA NA <NA> Ladson,…
## # ℹ 10,035 more rows
## # ℹ 10 more variables: formatted_work_type <chr>, applies <dbl>,
## # remote_allowed <dbl>, application_type <chr>,
## # formatted_experience_level <chr>, sponsored <dbl>, work_type <chr>,
## # currency <chr>, zip_code <chr>, fips <chr>
Since there were no duplicates with job_id included, and upon inspection of the duplicated rows without the id columns, it is safe to assume that these rows are separate job postings: either companies posting more than once for the same position, or re-posts after edits to the original posting. For these reasons, we will keep all of the duplicated rows without ids, since they are not true duplicates. Now, we will inspect the data more closely.
# Numeric ranges, NA counts, and basic stats for every column
summary(posting_clean)
## job_id company_name title max_salary
## Min. :9.217e+05 Length:123849 Length:123849 Min. : 1
## 1st Qu.:3.895e+09 Class :character Class :character 1st Qu.: 48
## Median :3.902e+09 Mode :character Mode :character Median : 80000
## Mean :3.896e+09 Mean : 91939
## 3rd Qu.:3.905e+09 3rd Qu.: 140000
## Max. :3.906e+09 Max. :120000000
## NA's :94056
## med_salary min_salary pay_period location
## Min. : 0.0 Min. : 1 Length:123849 Length:123849
## 1st Qu.: 18.9 1st Qu.: 37 Class :character Class :character
## Median : 25.5 Median : 60000 Mode :character Mode :character
## Mean : 22015.6 Mean : 64911
## 3rd Qu.: 2510.5 3rd Qu.: 100000
## Max. :750000.0 Max. :85000000
## NA's :117569 NA's :94056
## company_id formatted_work_type original_listed_time
## Min. : 1009 Length:123849 Min. :1.702e+12
## 1st Qu.: 14352 Class :character 1st Qu.:1.713e+12
## Median : 226965 Mode :character Median :1.713e+12
## Mean : 12204012 Mean :1.713e+12
## 3rd Qu.: 8047188 3rd Qu.:1.713e+12
## Max. :103472979 Max. :1.714e+12
## NA's :1717
## expiry closed_time applies remote_allowed
## Min. :1.713e+12 Min. :1.712e+12 Min. : 1.00 Min. :1
## 1st Qu.:1.715e+12 1st Qu.:1.713e+12 1st Qu.: 1.00 1st Qu.:1
## Median :1.716e+12 Median :1.713e+12 Median : 3.00 Median :1
## Mean :1.716e+12 Mean :1.713e+12 Mean : 10.59 Mean :1
## 3rd Qu.:1.716e+12 3rd Qu.:1.713e+12 3rd Qu.: 8.00 3rd Qu.:1
## Max. :1.729e+12 Max. :1.714e+12 Max. :967.00 Max. :1
## NA's :122776 NA's :100529 NA's :108603
## application_type formatted_experience_level sponsored work_type
## Length:123849 Length:123849 Min. :0 Length:123849
## Class :character Class :character 1st Qu.:0 Class :character
## Mode :character Mode :character Median :0 Mode :character
## Mean :0
## 3rd Qu.:0
## Max. :0
##
## currency zip_code fips
## Length:123849 Length:123849 Length:123849
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
# Compact structure dump: column types and the first few values of each
str(posting_clean)
## tibble [123,849 × 22] (S3: tbl_df/tbl/data.frame)
## $ job_id : num [1:123849] 921716 1829192 10998357 23221523 35982263 ...
## $ company_name : chr [1:123849] "Corcoran Sawyer Smith" NA "The National Exemplar" "Abrams Fensterman, LLP" ...
## $ title : chr [1:123849] "Marketing Coordinator" "Mental Health Therapist/Counselor" "Assitant Restaurant Manager" "Senior Elder Law / Trusts and Estates Associate Attorney" ...
## $ max_salary : num [1:123849] 20 50 65000 175000 80000 20 300000 120000 NA NA ...
## $ med_salary : num [1:123849] NA NA NA NA NA NA NA NA NA 350 ...
## $ min_salary : num [1:123849] 17 30 45000 140000 60000 14 60000 90000 NA NA ...
## $ pay_period : chr [1:123849] "HOURLY" "HOURLY" "YEARLY" "YEARLY" ...
## $ location : chr [1:123849] "Princeton, NJ" "Fort Collins, CO" "Cincinnati, OH" "New Hyde Park, NY" ...
## $ company_id : num [1:123849] 2774458 NA 64896719 766262 NA ...
## $ formatted_work_type : chr [1:123849] "Full-time" "Full-time" "Full-time" "Full-time" ...
## $ original_listed_time : num [1:123849] 1.71e+12 1.71e+12 1.71e+12 1.71e+12 1.71e+12 ...
## $ expiry : num [1:123849] 1.72e+12 1.72e+12 1.72e+12 1.72e+12 1.72e+12 ...
## $ closed_time : num [1:123849] NA NA NA NA NA NA NA NA NA NA ...
## $ applies : num [1:123849] 2 NA NA NA NA 4 1 NA NA NA ...
## $ remote_allowed : num [1:123849] NA NA NA NA NA NA 1 NA NA NA ...
## $ application_type : chr [1:123849] "ComplexOnsiteApply" "ComplexOnsiteApply" "ComplexOnsiteApply" "ComplexOnsiteApply" ...
## $ formatted_experience_level: chr [1:123849] NA NA NA NA ...
## $ sponsored : num [1:123849] 0 0 0 0 0 0 0 0 0 0 ...
## $ work_type : chr [1:123849] "FULL_TIME" "FULL_TIME" "FULL_TIME" "FULL_TIME" ...
## $ currency : chr [1:123849] "USD" "USD" "USD" "USD" ...
## $ zip_code : chr [1:123849] "08540" "80521" "45202" "11040" ...
## $ fips : chr [1:123849] "34021" "08069" "39061" "36059" ...
Formatting to a universal currency: USD. Since we will need to format
other columns such as max/med/min salary and pay period, we need to
tackle the currency column first. We know that there are multiple
currencies. First, we will factor currency to find out how many levels
it has and what its distribution looks like. Since this is a data set of
LinkedIn job postings in the United States, we can safely assume that if
the currency is not specified, it is USD. To support this assumption, we
can plot the distribution of currencies and observe that NA and USD
dominate. We will create a subset of the data where the currency is
neither USD nor NA to see how many observations in other currencies we
have.
# Encode currency as a factor and list the distinct currencies present
posting_clean <- posting_clean %>%
  mutate(currency = factor(currency))
levels(posting_clean$currency)
## [1] "AUD" "BBD" "CAD" "EUR" "GBP" "USD"
Factoring currency
# Using a log scale for y to visualize the rare currencies.
# `..count..` was deprecated in ggplot2 3.4.0; use `after_stat(count)`.
ggplot(posting_clean, aes(x = currency, y = after_stat(count))) +
  geom_bar() +
  scale_y_log10(labels = scales::comma)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Filtering the data for non-USD currencies.
# Use bare column names inside dplyr verbs: referencing posting_clean$currency
# inside filter() bypasses the piped data's masking and would silently break
# if the pipeline ever filtered or grouped the data first.
posting_clean_odd_currency <- posting_clean %>%
  filter(!is.na(currency)) %>%
  filter(currency != "USD")
# Bar chart of the distribution of non-USD currencies
# (`..count..` is deprecated since ggplot2 3.4.0)
ggplot(posting_clean_odd_currency, aes(x = currency, y = after_stat(count))) +
  geom_bar()
Since there are only a handful of odd currency, we can safely assume that these are outliers, so the default currency is USD. Now, we will repopulate the currency columns for USD value in place of NA’s.
# Temporarily go back to character so the NAs can be replaced,
# since replacing values in a factor directly would require a matching level
posting_clean$currency <- as.character(posting_clean$currency)
# Treat missing currency as USD (this is a US job-postings data set)
posting_clean <- posting_clean %>%
  mutate(currency = if_else(is.na(currency), "USD", currency))
# Re-encode as a factor now that the NAs are gone
posting_clean$currency <- factor(posting_clean$currency)
# Log-scaled counts again after the NA -> USD repopulation
# (`..count..` is deprecated since ggplot2 3.4.0; use after_stat(count))
ggplot(posting_clean, aes(x = currency, y = after_stat(count))) +
  geom_bar() +
  scale_y_log10(labels = scales::comma)
Now, we will normalize the other columns that are affected by currency, such as salary and pay rate. First, we have to figure out how many types of pay period there are.
# Distribution of pay periods, log-scaled counts
# (`..count..` is deprecated since ggplot2 3.4.0; use after_stat(count))
ggplot(posting_clean, aes(x = pay_period, y = after_stat(count))) +
  geom_bar() +
  scale_y_log10(labels = scales::comma)
Since NA pay periods make up quite a large proportion, we can assume that jobs without a specified pay period also do not include min/med/max salaries. We check this:
# Subset of postings that specify no pay period at all
posting_clean_pay_null <- filter(posting_clean, is.na(pay_period))
posting_clean_pay_null
## # A tibble: 87,776 × 22
## job_id company_name title max_salary med_salary min_salary pay_period
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 1218575 Children's Nebra… Resp… NA NA NA <NA>
## 2 9615617 Glastender, Inc. Insi… NA NA NA <NA>
## 3 11009123 PGAV Destinations Proj… NA NA NA <NA>
## 4 56482768 <NA> Appa… NA NA NA <NA>
## 5 56924323 <NA> Stru… NA NA NA <NA>
## 6 69333422 Staffing Theory Seni… NA NA NA <NA>
## 7 83789755 OsteoStrong Oste… NA NA NA <NA>
## 8 111513530 United Methodist… Cont… NA NA NA <NA>
## 9 115639136 Shannon Waltchack Cont… NA NA NA <NA>
## 10 117675818 Premier Family C… Phys… NA NA NA <NA>
## # ℹ 87,766 more rows
## # ℹ 15 more variables: location <chr>, company_id <dbl>,
## # formatted_work_type <chr>, original_listed_time <dbl>, expiry <dbl>,
## # closed_time <dbl>, applies <dbl>, remote_allowed <dbl>,
## # application_type <chr>, formatted_experience_level <chr>, sponsored <dbl>,
## # work_type <chr>, currency <fct>, zip_code <chr>, fips <chr>
# Is any salary ever present while pay_period is missing? (expect zero rows)
posting_clean_pay_null %>%
  filter(if_any(c(max_salary, min_salary, med_salary), ~ !is.na(.x)))
## # A tibble: 0 × 22
## # ℹ 22 variables: job_id <dbl>, company_name <chr>, title <chr>,
## # max_salary <dbl>, med_salary <dbl>, min_salary <dbl>, pay_period <chr>,
## # location <chr>, company_id <dbl>, formatted_work_type <chr>,
## # original_listed_time <dbl>, expiry <dbl>, closed_time <dbl>, applies <dbl>,
## # remote_allowed <dbl>, application_type <chr>,
## # formatted_experience_level <chr>, sponsored <dbl>, work_type <chr>,
## # currency <fct>, zip_code <chr>, fips <chr>
There are no instances where an expected salary (max/med/min) is given without a pay period, so we can rest assured. Now, we need to convert expected salaries to a normalized format. We choose yearly as the normalized format, since it is the easiest way to visualize annual earnings, and it is easy to convert from yearly to any other period.
# Annualize the three salary columns. A single multiplier lookup replaces
# three near-identical case_when() blocks: an unknown or missing pay_period
# yields an NA multiplier, which propagates NA exactly like the original
# `is.na(pay_period) ~ NA_real_` and `TRUE ~ NA_real_` branches.
pay_period_to_yearly <- c(
  HOURLY   = 40 * 52, # assumes a 40-hour week, 52 weeks a year
  WEEKLY   = 52,
  BIWEEKLY = 26,
  MONTHLY  = 12,
  YEARLY   = 1
)
posting_clean <- posting_clean %>%
  mutate(
    # pay_period is character, so this is a name lookup; NA index -> NA factor
    period_factor   = unname(pay_period_to_yearly[pay_period]),
    norm_max_salary = max_salary * period_factor,
    norm_med_salary = med_salary * period_factor,
    norm_min_salary = min_salary * period_factor
  ) %>%
  select(-period_factor)
Populating the norm_med_salary column based on max and min salary if NA
# Where the median is missing, fall back to the midpoint of max and min
# (still NA when either of those is NA, matching the original ifelse)
posting_clean <- posting_clean %>%
  mutate(norm_med_salary = coalesce(norm_med_salary, (norm_max_salary + norm_min_salary) / 2))
Converting currency
# Convert all three normalized salaries to USD in one pass. A named rate
# vector replaces three copies of the same case_when(); an unrecognized
# currency yields an NA rate, matching the original `TRUE ~ NA_real_` branch,
# and NA salaries stay NA under multiplication.
# NOTE(review): these rates are hard-coded snapshots - confirm they are
# recent enough for the analysis.
usd_per_unit <- c(
  AUD = 0.6184,
  BBD = 0.5,
  CAD = 0.682,
  EUR = 1.1,
  GBP = 1.3,
  USD = 1
)
posting_clean <- posting_clean %>%
  mutate(across(
    c(norm_med_salary, norm_max_salary, norm_min_salary),
    # as.character() is essential: indexing a named vector with a factor
    # would use the integer level codes instead of the currency names
    ~ .x * unname(usd_per_unit[as.character(currency)])
  ))
Converting original_listed_time, expiry, closed_time to date
# UNIX epoch milliseconds -> POSIXct (UTC): divide by 1000 to get seconds
posting_clean <- posting_clean %>%
  mutate(across(
    c(original_listed_time, expiry, closed_time),
    ~ as.POSIXct(.x / 1000, origin = "1970-01-01", tz = "UTC")
  ))
Inspect amount of null values per col
# Count of missing values in each column
colSums(is.na(posting_clean))
## job_id company_name
## 0 1719
## title max_salary
## 0 94056
## med_salary min_salary
## 117569 94056
## pay_period location
## 87776 0
## company_id formatted_work_type
## 1717 0
## original_listed_time expiry
## 0 0
## closed_time applies
## 122776 100529
## remote_allowed application_type
## 108603 0
## formatted_experience_level sponsored
## 29409 0
## work_type currency
## 0 0
## zip_code fips
## 20872 27415
## norm_max_salary norm_med_salary
## 94056 87776
## norm_min_salary
## 94056
Percentage of null value per col
# Share of missing values per column, as a percentage
# (colMeans is colSums / nrow, so the values are identical)
round(colMeans(is.na(posting_clean)) * 100, 2)
## job_id company_name
## 0.00 1.39
## title max_salary
## 0.00 75.94
## med_salary min_salary
## 94.93 75.94
## pay_period location
## 70.87 0.00
## company_id formatted_work_type
## 1.39 0.00
## original_listed_time expiry
## 0.00 0.00
## closed_time applies
## 99.13 81.17
## remote_allowed application_type
## 87.69 0.00
## formatted_experience_level sponsored
## 23.75 0.00
## work_type currency
## 0.00 0.00
## zip_code fips
## 16.85 22.14
## norm_max_salary norm_med_salary
## 75.94 70.87
## norm_min_salary
## 75.94
We will blank out the location wherever both zip_code and fips are null; we will later repopulate the location column based on the zip code or fips code.
# Blank out location wherever there is no zip_code and no fips to back it up
posting_clean <- posting_clean %>%
  mutate(location = if_else(is.na(zip_code) & is.na(fips), NA_character_, location))
Separate location into city and state
# Split "City, ST" into two columns.
# NOTE(review): sep = "," leaves a leading space on `state` (it is trimmed
# later with trimws()), and locations containing extra commas drop their
# trailing pieces with a warning - confirm that is acceptable.
posting_clean <- posting_clean %>%
separate(location, into = c("city", "state"), sep = ",")
Drop old max/med/min salary cols, currency col
# The raw salary and currency columns are superseded by the norm_* columns
posting_clean <- posting_clean %>%
  select(!c(max_salary, min_salary, med_salary, currency))
Relocate cols for easier cleaning
# Group related columns together; both moves fit in a single pipeline
posting_clean <- posting_clean %>%
  relocate(norm_max_salary, norm_med_salary, norm_min_salary, .before = pay_period) %>%
  relocate(zip_code, fips, .after = state)
Loading in the zip_code_database
# Reference table mapping zip codes to cities/states/coordinates
zip_code_database <- read_csv("zip_code_database.csv")
## Rows: 42735 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): zip, type, primary_city, acceptable_cities, unacceptable_cities, s...
## dbl (4): decommissioned, latitude, longitude, irs_estimated_population
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the columns useful for backfilling city/state from a zip code
zip_code_database_clean <- zip_code_database %>%
select(zip, primary_city, state, latitude, longitude)
Adding a suffix to zip_code_database_clean
# Suffix every column so the upcoming join cannot collide with posting columns
zip_code_database_clean <- zip_code_database_clean %>%
  rename_with(\(col) paste0(col, ".zipcode"))
Left joining posting_clean with zip_code_database_clean
# Attach the zip-code reference data; unmatched zip codes yield NA columns
posting_clean <- posting_clean %>%
left_join(zip_code_database_clean, by = c("zip_code" = "zip.zipcode"))
Populating city and state using zip_code. First, we need to find out how many observations have a null city or state while zip_code or fips is not null.
# Rows where the city/state split left a gap
posting_clean_city_or_state_null <- posting_clean %>%
filter(is.na(city) | is.na(state))
nrow(posting_clean_city_or_state_null)
## [1] 20872
# Of those, how many could still be recovered from zip_code/fips? (expect zero)
posting_zip_and_fips_not_null <- posting_clean_city_or_state_null %>%
filter(!is.na(zip_code) & !is.na(fips))
nrow(posting_zip_and_fips_not_null)
## [1] 0
There is no way to populate those columns: every instance where city or state is null is due to zip_code and fips both being null. We can remove the columns that we don't need.
# The joined reference columns were only needed for the backfill check;
# they are exactly the columns carrying the ".zipcode" suffix
posting_clean <- posting_clean %>%
  select(-ends_with(".zipcode"))
Let see the distribution of states:
# Horizontal bar chart of postings per state, alphabetical from the top
# (fct_rev reverses the default alphabetical level order, which after
# coord_flip() puts A at the top - same ordering as reorder(f, desc(f)))
posting_clean %>%
  filter(!is.na(state)) %>%
  ggplot(aes(x = fct_rev(factor(state)))) +
  geom_bar(width = 0.8) +
  coord_flip() +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 11)) +
  labs(x = "State", y = "Count")
We can confirm that there is no outliers in the state columns, and the format of state is correct
Reading the industries and job_industries tables into posting
# Lookup table: industry_id -> industry_name
industry <- read_csv("industries.csv")
## Rows: 422 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): industry_name
## dbl (1): industry_id
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Link table: job_id -> industry_id (one job can map to several industries)
job_industry <- read_csv("job_industries.csv")
## Rows: 164808 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): job_id, industry_id
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#posting_clean <- posting_clean %>%
# left_join(job_industry, by = "job_id")
Left joining will create duplicates on the posting_clean because 1 job_id can have more than 1 industry_id, so we will aggregate the job_industry by job_id So first, we will have to join job_industry with industry on industry_id, doing this will create a job_industry table with job_id, industry_id, and industry_name. After that, we can aggregate the industry_name grouped by job_id, this means a job can have multiple industries, and our posting_clean table will not create any duplicated posting.
# Resolve industry ids to names
job_industry <- job_industry %>%
left_join(industry, by = "industry_id")
# Collapse to one row per job: all of a job's industries comma-joined into
# one string, so the later join cannot fan out the postings table
job_industry_agg <- job_industry %>%
group_by(job_id) %>%
summarise(industries_name = paste(industry_name, collapse = ", "))
Now, we can left join posting_clean with job_industry_agg and we won’t have any duplicated rows of job postings, while keeping all the industries_name for further analysis
# One row per job_id on both sides now, so no posting rows are duplicated
posting_clean <- posting_clean %>%
left_join(job_industry_agg, by = "job_id") %>%
relocate(industries_name, .after = title)
Now, we want to categorize industries_name into smaller groups for further analysis, because as of right now there are over 3,300 unique values of industries_name (comma-separated industry combinations).
Attempt 1: Methods: We will try to find out which are the most common keywords of each industries, and the we can use that to try to categorize all the industries based on common keywords Search for top 50 keywords
Attempt 2: Expand the searching parameters to 100 most common instead
Attempt 3: Increase the size to 200
library(tidytext)
# Helper: the `n_top` most frequent keywords (stop words removed) across all
# industries_name values. Replaces three copy-pasted pipelines. top_n() is
# superseded; slice_max() keeps ties by default just like top_n(), and the
# input is already sorted by count, so the row order is unchanged.
top_industry_keywords <- function(data, n_top) {
  data %>%
    unnest_tokens(word, industries_name) %>%
    anti_join(stop_words, by = "word") %>% # Remove common stop words
    count(word, sort = TRUE) %>%
    slice_max(n, n = n_top)
}
top50_keywords <- top_industry_keywords(posting_clean, 50)
#View(top50_keywords)
top100_keywords <- top_industry_keywords(posting_clean, 100)
#View(top100_keywords)
top200_keywords <- top_industry_keywords(posting_clean, 200)
#View(top200_keywords)
#temporary_test <- top200_keywords %>%
# select(word)
#temporary_test2 <- read_csv("industry_keywords_by_field.csv")
#temporary_test3 <- temporary_test %>%
# semi_join(temporary_test2, by = c("word" = "Keyword"))
#tempurary_test4 <- temporary_test3 %>%
# inner_join(temporary_test2, by = c("word" = "Keyword"))
Mapping out individual keywords
# Postings whose industry list mentions defense/space (case-insensitive);
# NA industry strings are excluded either way (grepl returns FALSE for NA,
# filter drops the NA produced by str_detect)
posting_clean %>%
  filter(str_detect(industries_name, regex("Defense and space", ignore_case = TRUE)))
## # A tibble: 1,446 × 23
## job_id company_name title industries_name norm_max_salary norm_med_salary
## <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 3803052628 Insight Glo… Flig… Defense and Sp… 170000 155000
## 2 3815847830 Northrop Gr… Staf… Defense and Sp… 241400 201200
## 3 3850493836 Crown Point… Audi… Defense and Sp… NA NA
## 4 3884433891 Becker Wrig… Hill… Appliances, El… 125000 112500
## 5 3884436032 ITRS Recrui… Cost… Aviation and A… NA NA
## 6 3884436140 North Star … FPGA… Defense and Sp… NA NA
## 7 3884436258 AirBorn Inc. Seni… Defense and Sp… NA NA
## 8 3884437283 Pegasus Ste… Qual… Defense and Sp… NA NA
## 9 3884437415 JW Industri… Staf… Aviation and A… 160000 140000
## 10 3884438364 The Davis C… Exec… Defense and Sp… NA 29
## # ℹ 1,436 more rows
## # ℹ 17 more variables: norm_min_salary <dbl>, pay_period <chr>, city <chr>,
## # state <chr>, zip_code <chr>, fips <chr>, company_id <dbl>,
## # formatted_work_type <chr>, original_listed_time <dttm>, expiry <dttm>,
## # closed_time <dttm>, applies <dbl>, remote_allowed <dbl>,
## # application_type <chr>, formatted_experience_level <chr>, sponsored <dbl>,
## # work_type <chr>
#posting_clean[grepl("advertising", posting_clean$industries_name, ignore.case = TRUE), ]
#top200_keywords_test2 <- top200_keywords %>%
# left_join(tempurary_test4, by = "word")
Now, I want to see what the leading industries are, lets convert the current table into long format and aggregate by industries_name
# Long format: one row per (posting, industry) pair; trim the spaces left
# over from splitting on the bare comma
posting_clean_long <- posting_clean %>%
separate_rows(industries_name, sep = ",") %>%
mutate(industries_name = str_trim(industries_name))
Grouping by industries_name
# Postings per individual industry, most common first
posting_clean_long_agg <- posting_clean_long %>%
count(industries_name, sort = TRUE)
nrow(posting_clean_long_agg)
## [1] 424
Since there are about only 400 industries, we don’t have to group them down to bigger sectors Instead, let’s see the distribution of these industries
# Histogram of jobs-per-industry; log x because counts span orders of magnitude
ggplot(posting_clean_long_agg, aes(x = n)) +
geom_histogram() +
scale_x_log10(labels = scales::comma) +
labs(x = "Distribution of amount of jobs per industries")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Top 100 industries (the code below takes the 100 most common):
# Horizontal bar chart of the 100 most common industries (NA excluded);
# geom_col() is the idiomatic form of geom_bar(stat = "identity")
posting_clean_long_agg %>%
  filter(!is.na(industries_name)) %>%
  slice_max(n, n = 100) %>%
  ggplot(aes(x = reorder(industries_name, n), y = n)) +
  geom_col() +
  coord_flip() +
  theme(axis.text.x = element_text(size = 5))
Let's see job demand from 2023-2024. First, let's create a month-year column for data aggregation.
No time series available, because we are missing a lot of data
# Bucket each posting into the first day of its listing month
posting_clean_long_yearmonth <- posting_clean_long %>%
mutate(yearmonth = as.Date(format(original_listed_time, "%Y-%m-01"))) %>%
relocate(yearmonth, .after = original_listed_time)
# Names of the 100 most common industries. NA (postings with no industry
# attached) is excluded: it is not an industry, and the earlier top-100 bar
# chart already filters it out, so this keeps the two listings consistent.
posting_clean_long %>%
  count(industries_name, sort = TRUE) %>%
  filter(!is.na(industries_name)) %>%
  slice_max(n, n = 100) %>%
  pull(industries_name)
## [1] "Hospitals and Health Care"
## [2] "Retail"
## [3] "IT Services and IT Consulting"
## [4] "Staffing and Recruiting"
## [5] "Financial Services"
## [6] "Software Development"
## [7] "Manufacturing"
## [8] "Construction"
## [9] "Banking"
## [10] "Technology"
## [11] "Insurance"
## [12] "Hospitality"
## [13] "Pharmaceutical Manufacturing"
## [14] "Real Estate"
## [15] "Telecommunications"
## [16] "Non-profit Organizations"
## [17] "Industrial Machinery Manufacturing"
## [18] "Biotechnology Research"
## [19] "Motor Vehicle Manufacturing"
## [20] "Food and Beverage Services"
## [21] "Advertising Services"
## [22] "Business Consulting and Services"
## [23] "Accounting"
## [24] "Higher Education"
## [25] "Medical Equipment Manufacturing"
## [26] "Government Administration"
## [27] "Information and Internet"
## [28] "Wellness and Fitness Services"
## [29] "Defense and Space Manufacturing"
## [30] NA
## [31] "Logistics"
## [32] "Supply Chain and Storage"
## [33] "Transportation"
## [34] "Food and Beverage Manufacturing"
## [35] "Retail Apparel and Fashion"
## [36] "Civil Engineering"
## [37] "Electrical"
## [38] "Appliances"
## [39] "and Electronics Manufacturing"
## [40] "Medical Practices"
## [41] "Information Services"
## [42] "Oil and Gas"
## [43] "Information and Media"
## [44] "Environmental Services"
## [45] "Law Practice"
## [46] "Utilities"
## [47] "Aviation and Aerospace Component Manufacturing"
## [48] "Chemical Manufacturing"
## [49] "Restaurants"
## [50] "Wholesale Building Materials"
## [51] "Education Administration Programs"
## [52] "Truck Transportation"
## [53] "Computer and Network Security"
## [54] "Entertainment Providers"
## [55] "Engineering Services"
## [56] "Investment Management"
## [57] "Legal Services"
## [58] "Consumer Services"
## [59] "Mental Health Care"
## [60] "Human Resources Services"
## [61] "Research Services"
## [62] "Facilities Services"
## [63] "Information Technology & Services"
## [64] "Investment Banking"
## [65] "Civic and Social Organizations"
## [66] "Machinery Manufacturing"
## [67] "Primary and Secondary Education"
## [68] "Wholesale"
## [69] "Airlines and Aviation"
## [70] "Packaging and Containers Manufacturing"
## [71] "Architecture and Planning"
## [72] "Semiconductor Manufacturing"
## [73] "Automation Machinery Manufacturing"
## [74] "Marketing Services"
## [75] "Security and Investigations"
## [76] "Individual and Family Services"
## [77] "Professional Services"
## [78] "Veterinary Services"
## [79] "Design Services"
## [80] "Computer Hardware Manufacturing"
## [81] "Public Safety"
## [82] "Services for Renewable Energy"
## [83] "Broadcast Media Production and Distribution"
## [84] "Public Relations and Communications Services"
## [85] "Travel Arrangements"
## [86] "Health and Human Services"
## [87] "Personal Care Product Manufacturing"
## [88] "Retail Office Equipment"
## [89] "Freight and Package Transportation"
## [90] "Business Content"
## [91] "Computers and Electronics Manufacturing"
## [92] "Venture Capital and Private Equity Principals"
## [93] "Renewable Energy Semiconductor Manufacturing"
## [94] "Wireless Services"
## [95] "E-Learning Providers"
## [96] "Retail Luxury Goods and Jewelry"
## [97] "Education"
## [98] "Retail Groceries"
## [99] "Mining"
## [100] "Warehousing and Storage"
#posting_clean_long_top_3 <- posting_clean_long_yearmonth %>%
# filter(industries_name %in% top3_industries) %>%
# count(yearmonth, industries_name)
#ggplot(posting_clean_long_top_3, aes(x = yearmonth, y = n, color = industries_name)) +
# geom_line() +
# scale_y_log10()
Total jobs posted on LinkedIn
# Total postings per month.
# NOTE(review): this table appears unused in the visible code below
# (the next plot counts directly) - confirm whether it is still needed.
posting_clean_long_yearmonth_total <- posting_clean_long_yearmonth %>%
count(yearmonth)
Distribution of job postings by month
# Monthly posting counts on a log scale (already uses the modern after_stat)
ggplot(posting_clean_long_yearmonth, aes(x = yearmonth, y = after_stat(count))) +
geom_bar() +
scale_y_log10(labels = scales::comma)
For our data set, most job postings happen in March-April; we will use this knowledge to construct a choropleth map based on all jobs posted during this period.
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(ggthemes)
#posting_clean_long$state <- trimws(posting_clean_long$state)
#head(match(posting_clean_long$state, state.abb))
#posting_clean_long$state_full <- tolower(state.name[match(posting_clean_long$state, state.abb)])
# Normalize state abbreviations (remove the leading space left by separate())
# and derive lowercase full names to match map_data("state") regions
posting_clean$state <- trimws(posting_clean$state)
head(match(posting_clean$state, state.abb))
# NOTE(review): state.abb covers only the 50 states - DC and territories map
# to NA here and are dropped from the map; confirm that is intended.
posting_clean$state_full <- tolower(state.name[match(posting_clean$state, state.abb)])
# Base map polygons plus posting counts per mappable state
us_map_data <- map_data("state")
state_posting <- posting_clean %>%
  filter(!is.na(state_full)) %>%
  count(state_full)
#sum(state_posting$n)
state_posting_map_joined <- left_join(us_map_data, state_posting, by = c("region" = "state_full"))
ggplot(state_posting_map_joined, aes(x = long, y = lat, group = group, fill = n)) +
geom_polygon(color = "white") +
coord_fixed(1.3) +
theme_map() +
scale_fill_gradient2(
low = "red",
mid = "grey",
high = "forestgreen",
midpoint = median(state_posting_map_joined$n, na.rm = TRUE)
)
Let's see the distribution of the number of jobs across states first
# Per-state posting counts: first on a log scale, then on a raw scale,
# to judge how skewed the distribution is.
ggplot(state_posting, aes(x = state_full, y = n)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_y_log10()
ggplot(state_posting, aes(x = state_full, y = n)) +
geom_bar(stat = "identity") +
coord_flip()
summary(state_posting$n)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 118.0 431.5 1126.0 2039.7 2781.2 11386.0
Let’s try another map with the log10() used
# The raw counts are heavily right-skewed (mean >> median above), so map
# log10(n) instead of n to get a usable color gradient.
state_posting_logn <- state_posting %>%
mutate(logn = log10(n))
state_posting_logn_map_joined <- left_join(us_map_data, state_posting_logn, by = c("region" = "state_full"))
ggplot(state_posting_logn_map_joined, aes(x = long, y = lat, group = group, fill = logn)) +
geom_polygon(color = "white") +
coord_fixed(1.3) +
theme_map() +
scale_fill_gradient2(
low = "firebrick",
mid = "grey90",
high = "darkgreen",
midpoint = mean(state_posting_logn_map_joined$logn, na.rm = TRUE)
) +
theme(legend.position = "none")
Let's see the distribution of jobs in the tech industries. Because a job can be in multiple industries, and we are using the long-format data, I will drop any job postings duplicated on job_id, because they are most likely in the same industries anyway
# Industry labels treated as "tech" for the tech-only analysis below.
tech_industries <- c("IT Services and IT Consulting", "Software Development", "Technology", "Information and Internet", "Information and Media",
"Information Services", "Computer and Network Security", "Information Technology & Services")
# The 100 most common industry labels, to eyeball which ones are tech-related.
posting_clean_long %>%
count(industries_name, sort = TRUE) %>%
slice_max(n, n = 100) %>%
pull(industries_name)
## [1] "Hospitals and Health Care"
## [2] "Retail"
## [3] "IT Services and IT Consulting"
## [4] "Staffing and Recruiting"
## [5] "Financial Services"
## [6] "Software Development"
## [7] "Manufacturing"
## [8] "Construction"
## [9] "Banking"
## [10] "Technology"
## [11] "Insurance"
## [12] "Hospitality"
## [13] "Pharmaceutical Manufacturing"
## [14] "Real Estate"
## [15] "Telecommunications"
## [16] "Non-profit Organizations"
## [17] "Industrial Machinery Manufacturing"
## [18] "Biotechnology Research"
## [19] "Motor Vehicle Manufacturing"
## [20] "Food and Beverage Services"
## [21] "Advertising Services"
## [22] "Business Consulting and Services"
## [23] "Accounting"
## [24] "Higher Education"
## [25] "Medical Equipment Manufacturing"
## [26] "Government Administration"
## [27] "Information and Internet"
## [28] "Wellness and Fitness Services"
## [29] "Defense and Space Manufacturing"
## [30] NA
## [31] "Logistics"
## [32] "Supply Chain and Storage"
## [33] "Transportation"
## [34] "Food and Beverage Manufacturing"
## [35] "Retail Apparel and Fashion"
## [36] "Civil Engineering"
## [37] "Electrical"
## [38] "Appliances"
## [39] "and Electronics Manufacturing"
## [40] "Medical Practices"
## [41] "Information Services"
## [42] "Oil and Gas"
## [43] "Information and Media"
## [44] "Environmental Services"
## [45] "Law Practice"
## [46] "Utilities"
## [47] "Aviation and Aerospace Component Manufacturing"
## [48] "Chemical Manufacturing"
## [49] "Restaurants"
## [50] "Wholesale Building Materials"
## [51] "Education Administration Programs"
## [52] "Truck Transportation"
## [53] "Computer and Network Security"
## [54] "Entertainment Providers"
## [55] "Engineering Services"
## [56] "Investment Management"
## [57] "Legal Services"
## [58] "Consumer Services"
## [59] "Mental Health Care"
## [60] "Human Resources Services"
## [61] "Research Services"
## [62] "Facilities Services"
## [63] "Information Technology & Services"
## [64] "Investment Banking"
## [65] "Civic and Social Organizations"
## [66] "Machinery Manufacturing"
## [67] "Primary and Secondary Education"
## [68] "Wholesale"
## [69] "Airlines and Aviation"
## [70] "Packaging and Containers Manufacturing"
## [71] "Architecture and Planning"
## [72] "Semiconductor Manufacturing"
## [73] "Automation Machinery Manufacturing"
## [74] "Marketing Services"
## [75] "Security and Investigations"
## [76] "Individual and Family Services"
## [77] "Professional Services"
## [78] "Veterinary Services"
## [79] "Design Services"
## [80] "Computer Hardware Manufacturing"
## [81] "Public Safety"
## [82] "Services for Renewable Energy"
## [83] "Broadcast Media Production and Distribution"
## [84] "Public Relations and Communications Services"
## [85] "Travel Arrangements"
## [86] "Health and Human Services"
## [87] "Personal Care Product Manufacturing"
## [88] "Retail Office Equipment"
## [89] "Freight and Package Transportation"
## [90] "Business Content"
## [91] "Computers and Electronics Manufacturing"
## [92] "Venture Capital and Private Equity Principals"
## [93] "Renewable Energy Semiconductor Manufacturing"
## [94] "Wireless Services"
## [95] "E-Learning Providers"
## [96] "Retail Luxury Goods and Jewelry"
## [97] "Education"
## [98] "Retail Groceries"
## [99] "Mining"
## [100] "Warehousing and Storage"
# Sanity check: rows whose industry label contains "Technology" (any case).
posting_clean[grepl("Technology", posting_clean$industries_name, ignore.case = TRUE), ]
## # A tibble: 5,327 × 24
## job_id company_name title industries_name norm_max_salary norm_med_salary
## <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 175485704 GOYT Soft… Technology, In… NA NA
## 2 2269442456 navXcom Comp… Space Research… NA NA
## 3 2558399667 I.T. Soluti… Vali… Pharmaceutical… 145600 135200
## 4 2974397965 Lynx Systems Mark… Information Te… 85000 80000
## 5 3736684097 Cira Tek, I… Prog… Information Te… NA NA
## 6 3739478140 Merakris Th… Dist… Biotechnology … NA NA
## 7 3811513595 Cage Riot Vide… Technology, In… NA 41600
## 8 3813645405 Tenazx Inc Data… Technology, In… NA NA
## 9 3829184899 SanLuna LLC Soci… Technology, In… NA NA
## 10 3835806869 vMOX Head… Telecommunicat… 200000 175000
## # ℹ 5,317 more rows
## # ℹ 18 more variables: norm_min_salary <dbl>, pay_period <chr>, city <chr>,
## # state <chr>, zip_code <chr>, fips <chr>, company_id <dbl>,
## # formatted_work_type <chr>, original_listed_time <dttm>, expiry <dttm>,
## # closed_time <dttm>, applies <dbl>, remote_allowed <dbl>,
## # application_type <chr>, formatted_experience_level <chr>, sponsored <dbl>,
## # work_type <chr>, state_full <chr>
Filtering for job postings within the tech industries
# Restrict the long-format postings to the tech industry labels only.
posting_clean_long_tech <- posting_clean_long %>%
filter(industries_name %in% tech_industries)
sum(duplicated(posting_clean_long_tech$job_id))
## [1] 4924
# A job listed under several tech industries appears once per industry; keep
# one row per job_id. distinct(.keep_all = TRUE) is the idiomatic dplyr
# equivalent of filter(!duplicated(job_id)) and likewise keeps the first
# occurrence, so the result is unchanged.
posting_clean_long_tech_unique_id <- posting_clean_long_tech %>%
distinct(job_id, .keep_all = TRUE)
sum(duplicated(posting_clean_long_tech_unique_id$job_id))
## [1] 0
Cleaning the new dataset
# Same state normalization as for the full dataset, applied to the tech subset.
posting_clean_long_tech_unique_id$state <- trimws(posting_clean_long_tech_unique_id$state)
posting_clean_long_tech_unique_id$state_full <- tolower(state.name[match(posting_clean_long_tech_unique_id$state, state.abb)])
# Tech postings per state, then joined to state polygons; logn for the maps.
posting_clean_agg_tech_unique_id <- posting_clean_long_tech_unique_id %>%
count(state_full) %>%
filter(!is.na(state_full))
state_tech_posting_map_joined <- left_join(us_map_data, posting_clean_agg_tech_unique_id, by = c("region" = "state_full"))
state_tech_posting_map_joined$logn <- log10(state_tech_posting_map_joined$n)
# Three color-scale candidates for the tech-posting choropleth (log10 counts).
# 1) Diverging firebrick -> dark green around the mean of log10(n).
ggplot(state_tech_posting_map_joined, aes(x = long, y = lat, group = group, fill = logn)) +
geom_polygon(color = "white") +
coord_fixed(1.3) +
theme_map() +
scale_fill_gradient2(
low = "firebrick",
mid = "grey90",
high = "darkgreen",
midpoint = mean(state_tech_posting_map_joined$logn, na.rm = TRUE)
) +
theme(legend.position = "none")
# 2) Simple sequential grey -> dark green gradient.
ggplot(state_tech_posting_map_joined, aes(x = long, y = lat, group = group, fill = logn)) +
geom_polygon(color = "white") +
coord_fixed(1.3) +
theme_map() +
scale_fill_gradient(
low = "grey",
high = "darkgreen"
) +
theme(legend.position = "none")
# 3) Diverging scale staying within greens/greys.
ggplot(state_tech_posting_map_joined, aes(x = long, y = lat, group = group, fill = logn)) +
geom_polygon(color = "white") +
coord_fixed(1.3) +
theme_map() +
scale_fill_gradient2(
low = "grey",
mid = "#97B197",
high = "darkgreen",
midpoint = mean(state_tech_posting_map_joined$logn, na.rm = TRUE)
) +
theme(legend.position = "none")
Let’s load the data for graduates in the United States
library(readxl)
# Graduates per field of study; wide format with one column per academic year
# (see the pivot_longer below that reshapes columns 2:ncol).
degree_dataset <- read_excel("degree_dataset.xlsx")
#View(degree_dataset)
Convert it to long format
# Wide -> long: one row per (field of study, graduate_year).
degree_long <- degree_dataset %>%
pivot_longer(
cols = 2:ncol(degree_dataset),
names_to = "graduate_year",
values_to = "total_graduate"
)
# Quick spaghetti plot of every field over time.
ggplot(degree_long, aes(x = factor(graduate_year), y = total_graduate, group = `Field of study`)) +
geom_line() +
theme(axis.text.x = element_text(angle = 30, vjust = 0.6))
Top 10 and highlight
# Top 10 fields of study by total graduates summed over all years.
degree_long_top10 <- degree_long %>%
group_by(`Field of study`) %>%
summarise(total_per_field = sum(total_graduate)) %>%
# slice_max() orders by total_per_field itself, so the previous arrange()
# before it was dead code and has been removed.
slice_max(total_per_field, n = 10) %>%
pull(`Field of study`)
Because the graduate years are not on a continuous scale, we will have to convert them into normal years first
# Convert "2010-11"-style academic years to the numeric ending year (2011):
# strip the "-NN" suffix, coerce to numeric, add one.
degree_long <- degree_long %>%
mutate(graduate_year = as.numeric(gsub("-..$", "", graduate_year)) + 1)
Growing Fields
# Among the top-10 fields, highlight six of interest; the rest are dimmed via
# the alpha aesthetic and lumped as "Others" (grey, the 7th color with no
# entry in `breaks`).
degree_long %>%
filter(`Field of study` %in% degree_long_top10) %>%
mutate(highlight = ifelse(`Field of study` %in% c("Business", "Health professions and related programs", "Engineering", "Biological and biomedical sciences", "Computer and information sciences and support services", "Psychology"), "highlight", "normal"),
highlight2 = case_when(
`Field of study` == "Business" ~ "Business",
`Field of study` == "Health professions and related programs" ~ "Healthcare",
`Field of study` == "Engineering" ~ "Engineering",
`Field of study` == "Computer and information sciences and support services" ~ "Computer Science",
`Field of study` == "Biological and biomedical sciences" ~ "Biomedical Science",
`Field of study` == "Psychology" ~ "Psychology",
TRUE ~ "Others"
)) %>%
ggplot( aes(x = graduate_year, y = total_graduate, group = `Field of study`, color = highlight2, alpha = highlight)) +
geom_line(linewidth = 0.8) +
theme_classic() +
theme(axis.text.x = element_text(angle = 30, vjust = 0.6)) +
scale_alpha_manual(values = c("highlight" = 1, "normal" = 0.3)) +
labs(x = "Years", color = "Field of Study", title = "Top Growing Degrees") +
scale_color_manual(values = c("#16317DFF" ,"#007E2FFF", "#00B7A7FF", "#B86092FF", "#A40000FF","#F39C12", "grey"),
breaks = c("Business", "Healthcare", "Biomedical Science", "Engineering", "Psychology", "Computer Science")) +
theme(#legend.position = "none",
axis.title.y.left = element_blank(),
plot.title = element_text(hjust = 0.5, size = 20)) +
guides(alpha = "none") +
# Manual y labels assume the data stays within 0-400k.
scale_y_continuous(breaks = seq(0, 400000, 100000),
labels = c("0", "100k", "200k", "300k", "400k")) +
theme(panel.grid.major.y = element_line(),
axis.line.y.left = element_blank())
# Same styled plot, but over every field of study (not just the top 10).
degree_long %>%
mutate(highlight = ifelse(`Field of study` %in% c("Business", "Health professions and related programs", "Engineering", "Biological and biomedical sciences", "Computer and information sciences and support services", "Psychology"), "highlight", "normal"),
highlight2 = case_when(
`Field of study` == "Business" ~ "Business",
`Field of study` == "Health professions and related programs" ~ "Healthcare",
`Field of study` == "Engineering" ~ "Engineering",
`Field of study` == "Computer and information sciences and support services" ~ "Computer Science",
`Field of study` == "Biological and biomedical sciences" ~ "Biomedical Science",
`Field of study` == "Psychology" ~ "Psychology",
TRUE ~ "Others"
)) %>%
ggplot( aes(x = graduate_year, y = total_graduate, group = `Field of study`, color = highlight2, alpha = highlight)) +
geom_line(linewidth = 0.8) +
theme_classic() +
theme(axis.text.x = element_text(angle = 30, vjust = 0.6)) +
scale_alpha_manual(values = c("highlight" = 1, "normal" = 0.2)) +
labs(x = "Years", color = "Field of Study", title = "Top Growing Degrees") +
scale_color_manual(values = c("#16317DFF" ,"#007E2FFF", "#00B7A7FF", "#B86092FF", "#A40000FF","#F39C12", "grey"),
breaks = c("Business", "Healthcare", "Biomedical Science", "Engineering", "Psychology", "Computer Science")) +
theme(#legend.position = "none",
axis.title.y.left = element_blank(),
plot.title = element_text(hjust = 0.5, size = 20)) +
guides(alpha = "none")
# Indeed job-postings index (FRED series IHLIDXUS); per the caption below,
# daily, indexed to Feb 1 2020 = 100.
ind_posting <- read_csv("Indeed_Postings.csv")
## Rows: 1827 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): IHLIDXUS
## date (1): observation_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rename the FRED series column to something readable.
ind_posting <- ind_posting %>%
rename(Value = IHLIDXUS)
# Indexed posting volume over time; the window from the series start through
# 2023-05-11 is shaded red, key dates are annotated manually at y = -5
# (ylim(-5, NA) leaves room for those labels below the zero line).
ggplot(ind_posting, aes(x = observation_date, y = Value)) +
geom_line(linewidth = 1, color = "#5C90B2") +
ylim(-5, NA) +
scale_x_date(date_breaks = "1 year", date_labels = "%Y-%b") +
theme_classic() +
theme(axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.line.y.left = element_blank(),
panel.grid.major.y = element_line(),
axis.line.x.bottom = element_blank(),
axis.ticks.x.bottom = element_blank(),
axis.text.x = element_blank(),
plot.title = element_text(size = 20),
plot.caption = element_text(color = adjustcolor("black", alpha.f = 0.5))) +
geom_hline(yintercept = 0, color = "black") +
annotate("rect", xmin = as.Date(min(ind_posting$observation_date), format = "%Y-%m-%d"), xmax = as.Date("2023-05-11", format = "%Y-%m-%d"), ymin = 0, ymax = max(ind_posting$Value) + 5, fill = "red", alpha = 0.08) +
annotate("text", x = as.Date(min(ind_posting$observation_date), format = "%Y-%m-%d"), y = -5, label = "2020-03-28", size = 3.2, hjust = 0.4) +
annotate("text", x = as.Date("2022-04-01", format = "%Y-%m-%d"),y = -5, label = "2022-04-01", size = 3.2) +
annotate("segment", x = as.Date("2022-04-01", format = "%Y-%m-%d"), xend = as.Date("2022-04-01", format = "%Y-%m-%d"), y = 0, yend = max(ind_posting$Value), color = "black", linetype = "dashed") +
annotate("text", x = as.Date("2023-05-11", format = "%Y-%m-%d"), y = -5, label = "2023-05-11", size = 3.2) +
annotate("text", x = as.Date("2024-01-01", format = "%Y-%m-%d"), y = -5, label = "2024-01-01", size = 3.2) +
annotate("text", x = as.Date("2025-01-01", format = "%Y-%m-%d"), y = -5, label = "2025-01-01", size = 3.2) +
labs(title = "Job Postings On Indeed in the United States", caption = "Units: Index, Feb, 1, 2020 = 100, Seasonally Adjusted, Frequency: Daily, 7-Day")
devtools::install_github("UrbanInstitute/urbnmapr")
## WARNING: Rtools is required to build R packages, but is not currently installed.
##
## Please download and install Rtools 4.4 from https://cran.r-project.org/bin/windows/Rtools/.
## Skipping install of 'urbnmapr' from a github remote, the SHA1 (ef9f4488) has not changed since last install.
## Use `force = TRUE` to force installation
library(urbnmapr)
# sf geometries for US states and counties from urbnmapr.
states_sf <- get_urbn_map(map = "states", sf = TRUE)
counties_sf <- get_urbn_map(map = "counties", sf = TRUE)
# NOTE(review): st_crs() comes from sf, but library(sf) is only attached
# further down in this document — this chunk works only if sf is already
# loaded in the session; consider moving library(sf) above this plot.
ggplot(states_sf) +
geom_sf() +
coord_sf(crs=st_crs(4326)) +
theme_map()
# Postings per county FIPS code from the small dataset, joined to county
# polygons for a county-level choropleth.
posting_clean_fips_agrr <- posting_clean %>%
filter(!is.na(fips)) %>%
count(fips)
posting_fips_join <- left_join(counties_sf, posting_clean_fips_agrr, by = c("county_fips" = "fips"))
## old-style crs object detected; please recreate object with a recent sf::st_crs()
## Warning in CPL_crs_from_input(x): GDAL Message 1: CRS EPSG:2163 is deprecated.
## Its non-deprecated replacement EPSG:9311 will be used instead. To use the
## original CRS, set the OSR_USE_NON_DEPRECATED configuration option to NO.
library(sf)
## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
ggplot(posting_fips_join, aes(fill = n)) +
geom_sf() +
coord_sf(crs=st_crs(4326)) +
theme_map()
# Larger LinkedIn postings dataset (~1.35M rows, multiple countries).
posting_large <- read_csv("linkedin_job_postings.csv")
## Rows: 1348454 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): job_link, job_title, company, job_location, search_city, search_co...
## lgl (3): got_summary, got_ner, is_being_worked
## dttm (1): last_processed_time
## date (1): first_seen
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# US city lookup table used to map "City, ST" locations to county FIPS codes.
uscities <- read_csv("uscities.csv")
## Rows: 31254 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): city, city_ascii, state_id, state_name, county_fips, county_name, s...
## dbl (6): lat, lng, population, density, ranking, id
## lgl (2): military, incorporated
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only US postings from the large dataset.
posting_large_us <- posting_large %>%
filter(search_country == "United States")
nrow(posting_large_us)
## [1] 1149342
# Build a "City, ST" -> county_fips lookup. uscities contains duplicate
# "City, ST" rows (the same city name can repeat), which previously made the
# left_join many-to-many and inflated the joined row count beyond
# nrow(posting_large_us). Keeping one fips per location guarantees each
# posting matches at most one row.
uscities_clean <- uscities %>%
select(city, state_id, county_fips) %>%
mutate(location = paste(city, state_id, sep = ", ")) %>%
select(-city, - state_id) %>%
distinct(location, .keep_all = TRUE)
posting_large_us_join <- posting_large_us %>%
left_join(uscities_clean, by = c("job_location" = "location"), relationship = "many-to-one")
## Warning in left_join(., uscities_clean, by = c(job_location = "location")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 835 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Check for row inflation introduced by the join (compare to nrow(posting_large_us)).
nrow(posting_large_us_join)
## [1] 1150097
# Postings per county FIPS, ignoring rows that could not be geocoded.
posting_large_aggr <- posting_large_us_join %>%
drop_na(county_fips) %>%
count(county_fips)
# Attach the per-county counts to the county geometries (NA where no postings).
posting_large_fips_join <- left_join(counties_sf, posting_large_aggr, by = "county_fips" )
## old-style crs object detected; please recreate object with a recent sf::st_crs()
# County-level choropleth of log posting counts with state borders overlaid
# and a horizontal colorbar placed below the map.
# NOTE(review): log() here is the NATURAL log, but the hand-written labels
# claim powers of 10 (10^2.5 etc.) — these labels are only correct if log10()
# was intended; confirm and align. The 5 labels also assume ggplot picks
# exactly 5 breaks, which is fragile.
ggplot() +
geom_sf(data = posting_large_fips_join, aes(fill = log(n)), linewidth = 0.1) +
coord_sf(crs=st_crs(4326)) +
theme_map() +
theme(legend.position = c(0.2,-0.15),
legend.title = element_blank(),
plot.margin = margin(1,1,50,1)) +
scale_fill_gradientn(
colors = c("lightblue","#FFFF99", "orange", "darkred"),
na.value = "lightblue",
guide = guide_colorbar(barheight = 0.8, barwidth = 20, direction = "horizontal"),
limits = c(min(log(posting_large_fips_join$n), na.rm = TRUE),
max(log(posting_large_fips_join$n), na.rm = TRUE)),
labels = c("0",expression(10^2.5),expression(10^5),expression(10^7.5),expression(10^10))
) +
geom_sf(data = states_sf, color = "black", fill = NA, linewidth = 0.25)
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
## old-style crs object detected; please recreate object with a recent sf::st_crs()
posting_large_fips_join_logn <- posting_large_fips_join %>%
mutate(logn = log(n))
# Blue-scale variant of the county choropleth. Counties with no postings have
# n = NA, so the midpoint must be computed with na.rm = TRUE — without it
# mean() returns NA and the diverging scale has no usable midpoint.
ggplot(posting_large_fips_join_logn, aes(fill = logn)) +
geom_sf() +
coord_sf(crs = st_crs(4326)) +
theme_map() +
theme(legend.position = "right") +
scale_fill_gradient2(
low = "lightblue",
mid = "royalblue",
high = "darkblue",
midpoint = mean(posting_large_fips_join_logn$logn, na.rm = TRUE),
na.value = "lightblue"
)
#guide = guide_colorbar(barheight = 10, barwidth = 1)
library(ggalt)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
# Median of the normalized min/med/max salaries per industry AND experience
# level, using only postings where all three salary fields are present.
# (Ultimately unused — the level labels are noted below as unreliable.)
salary_aggr_by_lv <- posting_clean_long %>%
filter(!is.na(norm_min_salary) & !is.na(norm_med_salary) & !is.na(norm_max_salary)) %>%
filter(!is.na(industries_name) & !is.na(formatted_experience_level)) %>%
group_by(industries_name, formatted_experience_level) %>%
summarise(med_min = median(norm_min_salary),
med_med = median(norm_med_salary),
med_max = median(norm_max_salary))
## `summarise()` has grouped output by 'industries_name'. You can override using
## the `.groups` argument.
# Median normalized min/med/max salary per industry, restricted to postings
# where all three salary fields and the industry name are present.
# (Row filters do not depend on group context, so filtering before grouping
# yields the same result.)
salary_aggr_by_industry <- posting_clean_long %>%
filter(!is.na(industries_name)) %>%
filter(!is.na(norm_min_salary), !is.na(norm_med_salary), !is.na(norm_max_salary)) %>%
group_by(industries_name) %>%
summarise(med_min = median(norm_min_salary),
med_med = median(norm_med_salary),
med_max = median(norm_max_salary))
Because we want a somewhat accurate baseline, we will remove all industries with 20 or fewer mentions, so that the median pay is somewhat reflective of the true central value
# Industries mentioned 20 times or fewer; their medians are too noisy, so
# they are excluded from the top/bottom rankings below.
unique_industry <- posting_clean_long %>%
count(industries_name) %>%
filter(n <= 20) %>%
arrange(n) %>%
pull(industries_name)
# Ten best-paying industries by median-of-median salary (rare industries
# excluded), shown as a dumbbell from med_min to med_max with a point at med_med.
top10_pay_industry <- salary_aggr_by_industry %>%
filter(!industries_name %in% unique_industry) %>%
slice_max(med_med, n = 10) %>%
pull(industries_name)
salary_aggr_by_industry %>%
filter(industries_name %in% top10_pay_industry) %>%
ggplot() +
geom_dumbbell(aes(x = med_min, xend = med_max, y = reorder(industries_name, med_med))) +
geom_point(aes(x = med_med,y = reorder(industries_name, med_med)))
## Warning: Using the `size` aesthetic with geom_segment was deprecated in ggplot2 3.4.0.
## ℹ Please use the `linewidth` aesthetic instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Same dumbbell view, widened to the 20 best-paying industries.
top20_pay_industry <- salary_aggr_by_industry %>%
filter(!industries_name %in% unique_industry) %>%
slice_max(med_med, n = 20) %>%
pull(industries_name)
salary_aggr_by_industry %>%
filter(industries_name %in% top20_pay_industry) %>%
ggplot() +
geom_dumbbell(aes(x = med_min, xend = med_max, y = reorder(industries_name, med_med))) +
geom_point(aes(x = med_med,y = reorder(industries_name, med_med)))
# Polished version of the top-20 chart built from geom_segment + two points.
# "Movies" and "and Sound" look like fragments of one comma-split industry
# name ("Videos, Movies, and Sound"); they are dropped here and the remaining
# "Videos" fragment is relabeled on the y axis below.
salary_aggr_by_industry %>%
filter(industries_name %in% top20_pay_industry) %>%
filter(!industries_name %in% c("Movies", "and Sound")) %>%
ggplot() +
geom_segment(aes(x = med_min, y = reorder(industries_name, med_med), xend = med_max, yend = reorder(industries_name, med_med)),
color = "grey",
alpha = 0.8,
linewidth = 4.5) +
geom_point(aes(x = med_min, y = reorder(industries_name, med_med)), color = "red", size = 4.5) +
geom_point(aes(x = med_max, y = reorder(industries_name, med_med)), color = "blue", size = 4.5) +
theme_classic() +
theme(panel.grid.major.x = element_line(),
axis.line.y.left = element_blank(),
axis.ticks.y.left = element_blank(),
axis.ticks.x.bottom = element_blank(),
axis.line.x.bottom = element_blank(),
axis.text.y.left = element_text(size = 12)) +
# Fixed 100k-300k window; values outside are clipped.
scale_x_continuous(limit = c(100000, 300000),
breaks = seq(100000, 300000, 50000)) +
scale_y_discrete(labels = c("Videos" = "Videos, Movies, and Sounds"))
It is not worth breaking this down by experience level, because the levels are not accurately labeled and do not reflect reality
# Mirror of the top-pay charts: the 10 and then 20 worst-paying industries.
bottom10_pay_industry <- salary_aggr_by_industry %>%
filter(!industries_name %in% unique_industry) %>%
slice_min(med_med, n = 10) %>%
pull(industries_name)
salary_aggr_by_industry %>%
filter(industries_name %in% bottom10_pay_industry) %>%
ggplot() +
geom_dumbbell(aes(x = med_min, xend = med_max, y = reorder(industries_name, med_med))) +
geom_point(aes(x = med_med,y = reorder(industries_name, med_med)))
bottom20_pay_industry <- salary_aggr_by_industry %>%
filter(!industries_name %in% unique_industry) %>%
slice_min(med_med, n = 20) %>%
pull(industries_name)
salary_aggr_by_industry %>%
filter(industries_name %in% bottom20_pay_industry) %>%
ggplot() +
geom_dumbbell(aes(x = med_min, xend = med_max, y = reorder(industries_name, med_med))) +
geom_point(aes(x = med_med,y = reorder(industries_name, med_med)))
# Scratch subsets for manual inspection (not used by later chunks).
testing123 <- posting %>%
filter(pay_period == "HOURLY")
testing321 <- posting_clean_long %>%
filter(industries_name == "Hospitals and Health Care")
# FRED series CGBD2534 — per the app title below, college graduate
# unemployment percentage.
grad_unemp <- read_csv("CGBD2534.csv")
## Rows: 303 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): CGBD2534
## date (1): observation_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rename to the generic date/value column names the plots below expect.
grad_unemp <- grad_unemp %>%
rename(date = observation_date,
value = CGBD2534)
library(shiny)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Rectangles to shade on the unemployment chart; ymax = 11 spans the full
# visible y range of the plot.
make_highlight <- function(start, end) {
data.frame(xmin = as.Date(start), xmax = as.Date(end), ymin = 0, ymax = 11)
}
highlight1 <- make_highlight("2001-03-01", "2001-10-01")
highlight2 <- make_highlight("2008-01-01", "2009-05-01")
highlight3 <- make_highlight("2020-02-01", "2020-05-01")
# App 1 UI: a start-year slider (from the first grad_unemp observation to the
# current year) above a single interactive line chart.
ui <- fluidPage(
titlePanel("College Graduate Unemployment Percentage"),
fluidRow(
column(11,
sliderInput("start_year",
"Select Start Year:",
min = as.numeric(format(min(grad_unemp$date), "%Y")),
max = as.numeric(format(Sys.Date(), "%Y")),
value = as.numeric(format(min(grad_unemp$date), "%Y")),
sep = "",
width = '100%')
)
),
fluidRow(
column(11,
plotlyOutput("linePlot")
)
)
)
# App 1 server: line chart of graduate unemployment from the selected start
# year, with recession windows shaded.
server <- function(input, output) {
output$linePlot <- renderPlotly({
start_date <- as.Date(paste0(input$start_year, "-01-01"))
filtered_data <- grad_unemp %>%
filter(date >= start_date)
p <- ggplot() +
geom_line(data = filtered_data, aes(x = date, y = value), color = "#5C90B2") +
labs(title = paste("Data from", input$start_year, "to Present"),
x = "Date",
y = "Unemployment Rate %") +
theme_bw() +
theme(axis.line.y.left = element_blank(),
axis.ticks = element_blank(),
panel.grid.major.x = element_blank()) +
scale_y_continuous(limits = c(0, NA), breaks = seq(0, 10, 1))
# Shade each highlight window still visible from the selected start year.
# The three windows were previously handled by three copy-pasted
# filter-then-if blocks; a loop keeps the logic in one place and adds the
# rect layers in the same order (1, 2, 3) as before.
for (hl in list(highlight1, highlight2, highlight3)) {
hl_visible <- hl %>%
filter(xmax >= start_date)
if (nrow(hl_visible) > 0) {
p <- p + geom_rect(data = hl_visible,
aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
fill = "grey", alpha = 0.5, inherit.aes = FALSE)
}
}
ggplotly(p)
})
}
# Launch the interactive app (blocks until the app is closed).
shinyApp(ui = ui, server = server)
# Quick static looks at graduate unemployment since 2019...
grad_unemp %>%
filter(date >= as.Date("2019-01-01")) %>%
ggplot(aes(x = date, y = value)) +
geom_line()
# ...and since 2022.
grad_unemp %>%
filter(date >= as.Date("2022-01-01")) %>%
ggplot(aes(x = date, y = value)) +
geom_line()
# Same per-industry salary medians as salary_aggr_by_industry, plus the
# number of postings contributing to each industry. Filters are row-wise and
# independent of grouping, so they are applied before group_by here.
salary_aggr_by_industry_count <- posting_clean_long %>%
filter(!is.na(industries_name)) %>%
filter(!is.na(norm_min_salary), !is.na(norm_med_salary), !is.na(norm_max_salary)) %>%
group_by(industries_name) %>%
summarise(med_min = median(norm_min_salary),
med_med = median(norm_med_salary),
med_max = median(norm_max_salary),
count = n())
# Scatter of median salary vs posting count per industry, log-log axes.
# stat = "identity" is the default for geom_point, so the redundant argument
# has been removed (no behavior change).
salary_aggr_by_industry_count %>%
filter(med_med > 10000 & count >= 10) %>%
ggplot() +
geom_point(aes(x = med_med, y = count, color = industries_name)) +
theme(legend.position = "none") +
scale_x_log10() +
scale_y_log10()
# Same data with the axes swapped: demand (count) on x, salary on y.
salary_aggr_by_industry_count %>%
filter(med_med > 10000 & count >= 10) %>%
ggplot() +
geom_point(aes(x = count, y = med_med, color = industries_name)) +
theme(legend.position = "none") +
scale_x_log10() +
scale_y_log10()
# Interactive plotly version: salary (x, log) vs count (y, log) with the
# industry name shown on hover.
salary_aggr_by_industry_count %>%
filter(med_med > 10000 & count >= 10) %>%
plot_ly(
x = ~med_med,
y = ~count,
type = "scatter",
mode = "markers",
text = ~industries_name,
marker = list(size = 10, opacity = 0.7, color = 'steelblue')
) %>%
layout(
xaxis = list(
type = "log",
title = "Median Salary",
tickvals = c(10000, 30000, 100000, 300000, 1000000),
ticktext = c("10K", "30K", "100K", "300K", "1M")
),
yaxis = list(
type = "log",
title = "Count",
tickvals = c(10, 30, 100, 300, 1000, 3000),
ticktext = c("10", "30", "100", "300", "1K", "3K")
),
showlegend = FALSE
)
# Axes swapped: count (x, log) vs median salary (y, log), hover = industry.
salary_aggr_by_industry_count %>%
filter(med_med > 10000 & count >= 10) %>%
plot_ly(
x = ~count,
y = ~med_med,
type = "scatter",
mode = "markers",
text = ~industries_name,
marker = list(size = 10, opacity = 0.7, color = 'steelblue')
) %>%
layout(
xaxis = list(
type = "log",
title = "Count",
tickvals = c(10, 30, 100, 300, 1000, 3000),
ticktext = c("10", "30", "100", "300", "1K", "3K")
),
yaxis = list(
type = "log",
title = "Median Salary",
tickvals = c(50000, 70000, 100000, 150000, 200000),
# Capitalization fixed ("150k"/"200k" -> "150K"/"200K") to match the
# other tick labels in this document.
ticktext = c("50K", "70K", "100K", "150K", "200K")
),
showlegend = FALSE
)
# Same scatter on linear axes for comparison with the log-log versions above.
salary_aggr_by_industry_count %>%
filter(med_med > 10000 & count >= 10) %>%
plot_ly(
x = ~count,
y = ~med_med,
type = "scatter",
mode = "markers",
text = ~industries_name,
marker = list(size = 10, opacity = 0.7, color = 'steelblue')
) %>%
layout(
xaxis = list(
title = "Count"
),
yaxis = list(
title = "Median Salary"
),
showlegend = FALSE
)
library(shiny)
library(plotly)
library(dplyr)
# Data for the searchable scatter apps: industries with a meaningful median
# salary and at least 10 postings.
filtered_industry_posting <- salary_aggr_by_industry_count %>%
filter(med_med > 10000 & count >= 10)
# App 2 UI: free-text industry search above an interactive scatter.
ui <- fluidPage(
titlePanel("Industry Salary vs Count (Log Scale)"),
fluidRow(
column(12,
textInput("industry_search", "Search for an Industry:", "")
)
),
fluidRow(
column(12,
plotlyOutput("plot")
)
)
)
# App 2 server: highlight industries whose name matches the search text
# (case-insensitive regex) in red; everything else stays steelblue.
# NOTE(review): grepl("") matches every string, so an empty search box
# highlights all points — confirm that is the intended default.
server <- function(input, output) {
output$plot <- renderPlotly({
plot_data <- filtered_industry_posting %>%
mutate(
is_highlighted = grepl(input$industry_search, industries_name, ignore.case = TRUE),
point_color = ifelse(is_highlighted, "red", "steelblue"),
point_size = ifelse(is_highlighted, 10, 7),
point_opacity = ifelse(is_highlighted, 1, 0.6)
)
plot_ly(
data = plot_data,
x = ~count,
y = ~med_med,
type = "scatter",
mode = "markers",
text = ~industries_name,
marker = list(
color = ~point_color,
size = ~point_size,
opacity = ~point_opacity,
line = list(width = 1, color = "white")
)
) %>%
layout(
xaxis = list(
type = "log",
title = "Count",
tickvals = c(10, 30, 100, 300, 1000, 3000),
ticktext = c("10", "30", "100", "300", "1K", "3K")
),
yaxis = list(
type = "log",
title = "Median Salary",
tickvals = c(50000, 70000, 100000, 150000, 200000),
ticktext = c("50K", "70K", "100K", "150K", "200K")
),
showlegend = FALSE
)
})
}
# Launch app 2.
shinyApp(ui = ui, server = server)
library(shiny)
library(plotly)
library(dplyr)
# App 3 is the same searchable scatter with the axes swapped (salary on x).
# The filtered data is rebuilt identically to app 2.
filtered_industry_posting <- salary_aggr_by_industry_count %>%
filter(med_med > 10000 & count >= 10)
ui <- fluidPage(
titlePanel("Expected Salary and Demand"),
fluidRow(
column(12,
textInput("industry_search", "Search for an Industry:", "")
)
),
fluidRow(
column(12,
plotlyOutput("plot")
)
)
)
# Server for the industry salary/demand scatter.  Each point is one
# industry (x = median salary, y = posting count, both log-scaled);
# typing in the search box highlights matching industries in red.
server <- function(input, output) {
  output$plot <- renderPlotly({
    query <- input$industry_search
    plot_data <- filtered_industry_posting %>%
      mutate(
        # Bug fix: grepl("") matches every string, so the default empty
        # search box used to highlight ALL points red.  Require a
        # non-empty query via nzchar().  Matching is done on lower-cased
        # plain text (fixed = TRUE) so a regex metacharacter such as "("
        # typed by the user cannot crash the render.
        is_highlighted = nzchar(query) &
          grepl(tolower(query), tolower(industries_name), fixed = TRUE),
        point_color = ifelse(is_highlighted, "red", "steelblue"),
        point_size = ifelse(is_highlighted, 10, 7),
        point_opacity = ifelse(is_highlighted, 1, 0.6)
      )
    plot_ly(
      data = plot_data,
      x = ~med_med,
      y = ~count,
      type = "scatter",
      mode = "markers",
      text = ~industries_name,
      marker = list(
        color = ~point_color,
        size = ~point_size,
        opacity = ~point_opacity,
        line = list(width = 1, color = "white")
      )
    ) %>%
      layout(
        xaxis = list(
          type = "log",
          title = "Median Salary",
          tickvals = c(50000, 70000, 100000, 150000, 200000),
          ticktext = c("50K", "70K", "100K", "150K", "200K")
        ),
        yaxis = list(
          type = "log",
          title = "Count",
          tickvals = c(10, 30, 100, 300, 1000, 3000),
          ticktext = c("10", "30", "100", "300", "1K", "3K")
        ),
        showlegend = FALSE
      )
  })
}
# Launch the industry salary/demand app defined just above.
shinyApp(ui = ui, server = server)
# Monthly U.S. unemployment rate (FRED series UNRATE).
us_unemp <- read_csv("UNRATE.csv")
## Rows: 927 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): UNRATE
## date (1): observation_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Standardize the column names and coerce the rate to numeric so the
# series lines up with the other unemployment data in this analysis.
us_unemp <- us_unemp %>%
  rename(date = observation_date, value = UNRATE) %>%
  mutate(value = as.numeric(value))
# Quick sanity check: plot the full unemployment series, anchoring the
# y-axis at 0 so the level (not just the variation) is visible.
ggplot(us_unemp, aes(x = date, y = value)) +
  geom_line() +
  scale_y_continuous(limits = c(0, NA))
library(shiny)
library(ggplot2)
library(plotly)
library(dplyr)
# UI for the unemployment comparison app: one start-year slider on top,
# two side-by-side plotly panels (college grads vs. general population).
ui <- fluidPage(
titlePanel("U.S. Unemployment Rates Over Time"),
fluidRow(
column(12,
# NOTE(review): the slider range is derived from grad_unemp only —
# if us_unemp starts earlier, those earlier years are unreachable.
# Confirm this is intentional.
sliderInput("start_year",
"Select Start Year:",
min = as.numeric(format(min(grad_unemp$date), "%Y")),
max = as.numeric(format(Sys.Date(), "%Y")),
value = as.numeric(format(min(grad_unemp$date), "%Y")),
sep = "",
width = '100%'))),
fluidRow(
column(6,
h4("College Graduates"),
plotlyOutput("linePlot")),
column(6,
h4("US General Population"),
plotlyOutput("linePlot2"))))
# Server for the unemployment comparison app.  Both panels share one
# start-year slider and one y-axis maximum so they are directly
# comparable.  The original duplicated the filtering, the shared-max
# computation, and the full plot spec in each render function; that
# work is factored into one reactive and one panel builder here.
server <- function(input, output) {
  # Filter both series once per slider change (instead of twice).
  filtered <- reactive({
    start_date <- as.Date(paste0(input$start_year, "-01-01"))
    list(
      grad = grad_unemp %>% filter(date >= start_date),
      us = us_unemp %>% filter(date >= start_date)
    )
  })
  # Shared y-axis ceiling across both panels.
  max_y <- reactive({
    max(c(filtered()$grad$value, filtered()$us$value), na.rm = TRUE)
  })
  # Build one panel; both outputs use identical styling.
  make_panel <- function(df, y_max, start_year) {
    p <- ggplot(df, aes(x = date, y = value)) +
      geom_line(color = "#5C90B2") +
      labs(title = paste("Data from", start_year, "to Present"),
        x = "Date", y = "Unemployment Rate %") +
      scale_y_continuous(limits = c(0, y_max)) +
      theme_bw() +
      theme(panel.grid.major.x = element_blank())
    ggplotly(p)
  }
  output$linePlot <- renderPlotly({
    make_panel(filtered()$grad, max_y(), input$start_year)
  })
  output$linePlot2 <- renderPlotly({
    make_panel(filtered()$us, max_y(), input$start_year)
  })
}
# Launch the unemployment comparison app defined just above.
shinyApp(ui = ui, server = server)